Importing Libraries¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn. metrics import mean_absolute_error, mean_squared_error, r2_score

loading the dataset¶

In [2]:
df = pd.read_csv('train.csv')
In [3]:
df.head()
Out[3]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969

Data Understanding¶

In [4]:
df.shape
Out[4]:
(550068, 12)
In [5]:
df.describe()
Out[5]:
User_ID Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
count 5.500680e+05 550068.000000 550068.000000 550068.000000 376430.000000 166821.000000 550068.000000
mean 1.003029e+06 8.076707 0.409653 5.404270 9.842329 12.668243 9263.968713
std 1.727592e+03 6.522660 0.491770 3.936211 5.086590 4.125338 5023.065394
min 1.000001e+06 0.000000 0.000000 1.000000 2.000000 3.000000 12.000000
25% 1.001516e+06 2.000000 0.000000 1.000000 5.000000 9.000000 5823.000000
50% 1.003077e+06 7.000000 0.000000 5.000000 9.000000 14.000000 8047.000000
75% 1.004478e+06 14.000000 1.000000 8.000000 15.000000 16.000000 12054.000000
max 1.006040e+06 20.000000 1.000000 20.000000 18.000000 18.000000 23961.000000
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB
In [7]:
df.isna().sum()
# there is 173638 missing values in Product_Category_2 and 383247 missing values in Product_Category_3
Out[7]:
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

Data Visualization¶

In [8]:
df.head()
Out[8]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
In [9]:
plt.figure(figsize = (8, 6))
sns.countplot(data = df, x = 'Gender')
Out[9]:
<Axes: xlabel='Gender', ylabel='count'>
No description has been provided for this image

count of Male gender is higher as compated to the Female

In [10]:
plt.figure(figsize=(8,6))
# creating barplot of Gender and purchase
sns.barplot(data = df, x = "Gender", y = "Purchase")
Out[10]:
<Axes: xlabel='Gender', ylabel='Purchase'>
No description has been provided for this image

Higher purchase have been done by male gender as compared to the female

In [11]:
sns.barplot(y = 'Purchase', x= 'Occupation', data = df)
Out[11]:
<Axes: xlabel='Occupation', ylabel='Purchase'>
No description has been provided for this image

Occupation 7, 12, 14, 17 has higher purchase

In [12]:
plt.figure(figsize = (10,6))
sns.barplot(x = "Occupation", y ="Purchase" , hue = "Gender", data = df)
Out[12]:
<Axes: xlabel='Occupation', ylabel='Purchase'>
No description has been provided for this image

Outliers Detection¶

In [13]:
sns.boxplot(data = df , x = "Gender", y = "Purchase")
Out[13]:
<Axes: xlabel='Gender', ylabel='Purchase'>
No description has been provided for this image
In [14]:
sns.catplot(data = df.sort_values('Purchase', ascending=False), kind = 'boxen', height = 7, aspect = 3 , x = "Gender", y = "Purchase")
D:\Projects\analysis\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
Out[14]:
<seaborn.axisgrid.FacetGrid at 0x2250c03c5d0>
No description has been provided for this image
In [15]:
sns.boxplot(data =df, x = "Occupation", y = "Purchase")
Out[15]:
<Axes: xlabel='Occupation', ylabel='Purchase'>
No description has been provided for this image

This column has outliers which may effect the performance of the machine learning models

In [16]:
sns.boxplot(data = df,x = "Age", y = "Purchase")
Out[16]:
<Axes: xlabel='Age', ylabel='Purchase'>
No description has been provided for this image

This column also have some outliers

In [17]:
sns.boxplot(data =df , x = 'Product_Category_1', y = "Purchase")
Out[17]:
<Axes: xlabel='Product_Category_1', ylabel='Purchase'>
No description has been provided for this image

This column has outliers

Data Preprocessing¶

In [18]:
df.head()
Out[18]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
In [19]:
df['Product_ID'] = df['Product_ID'].str.replace('P00', '') # replacing P00 with blank space
In [20]:
ss = StandardScaler()
In [21]:
df['Product_ID'] = ss.fit_transform(df['Product_ID'].values.reshape(-1,1))
In [22]:
df.drop(['Product_Category_3'], axis =1, inplace = True) # max no of missing values so we drop this col
In [23]:
df['Product_Category_2']= df['Product_Category_2'].fillna(df['Product_Category_2'].mean()) # filling nan values with the mean
In [24]:
df.isna().sum()
Out[24]:
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Purchase                      0
dtype: int64
In [25]:
df.head()
Out[25]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Purchase
0 1000001 -1.028774 F 0-17 10 A 2 0 3 9.842329 8370
1 1000001 0.722139 F 0-17 10 A 2 0 1 6.000000 15200
2 1000001 -0.845799 F 0-17 10 A 2 0 12 9.842329 1422
3 1000001 -0.869157 F 0-17 10 A 2 0 12 14.000000 1057
4 1000002 1.077382 M 55+ 16 C 4+ 0 8 9.842329 7969

Label Encoding¶

In [26]:
categorical_columns = ['Gender', 'City_Category', 'Age']
le = LabelEncoder()
for i in categorical_columns:
    df[i] = le.fit_transform(df[i])
df.dtypes
Out[26]:
User_ID                         int64
Product_ID                    float64
Gender                          int32
Age                             int32
Occupation                      int64
City_Category                   int32
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Purchase                        int64
dtype: object
In [27]:
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].replace('4+', '4')
In [28]:
# changing the datatypes to integer
df['Gender'] = df['Gender'].astype(int)
df['Age'] = df['Age'].astype(int)
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(int)
df['City_Category'] = df['City_Category'].astype('category')
In [29]:
 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 d

NameError: name 'd' is not defined
In [30]:
df.head()
Out[30]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Purchase
0 1000001 -1.028774 0 0 10 0 2 0 3 9.842329 8370
1 1000001 0.722139 0 0 10 0 2 0 1 6.000000 15200
2 1000001 -0.845799 0 0 10 0 2 0 12 9.842329 1422
3 1000001 -0.869157 0 0 10 0 2 0 12 14.000000 1057
4 1000002 1.077382 1 6 16 2 4 0 8 9.842329 7969
In [31]:
### Distribution Plots
In [32]:
rows = 3
cols = 3
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(10, 4))
col = df.columns
index = 2

for i in range(rows):
    for j in range(cols):
        if index < len(col):
            sns.histplot(df[col[index]], ax=ax[i][j], kde=True)  # You can use sns.kdeplot() for KDE plots
            ax[i][j].set_title(col[index])
            index += 1
        else:
            ax[i][j].axis("off")  # Turn off empty subplots if there are fewer columns than rows*cols

plt.tight_layout()
plt.show()
No description has been provided for this image
In [33]:
df['Purchase'] =np.log(df['Purchase']) # transfer data into normal distribution
In [34]:
df = pd.get_dummies(df)
# used to convert categorical variable into dummy/indicator variables
df.head()
Out[34]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Purchase City_Category_0 City_Category_1 City_Category_2
0 1000001 -1.028774 0 0 10 2 0 3 9.842329 9.032409 True False False
1 1000001 0.722139 0 0 10 2 0 1 6.000000 9.629051 True False False
2 1000001 -0.845799 0 0 10 2 0 12 9.842329 7.259820 True False False
3 1000001 -0.869157 0 0 10 2 0 12 14.000000 6.963190 True False False
4 1000002 1.077382 1 6 16 4 0 8 9.842329 8.983314 False False True

Machine Learning Part¶

In [35]:
X = df.drop(labels = ['Purchase'], axis =1)
y = df['Purchase']
In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# the data is split into 80 percent train size and 20 percent test size
In [37]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

Linear Regression¶

In [38]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[38]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [39]:
y_predict = lr.predict(X_test)
## predicting on X_test
print('r2_score:', r2_score(y_test, y_predict))
print('mean_absolute_error:', mean_absolute_error(y_test, y_predict))
print('mean_squared_error:', mean_squared_error(y_test, y_predict))
print('root_mean_squared_error',np.sqrt(mean_squared_error(y_test, y_predict)))
r2_score: 0.20164239829578356
mean_absolute_error: 0.45565817118315044
mean_squared_error: 0.44379631133591096
root_mean_squared_error 0.6661803894861443
In [40]:
# r2 score is 0.20 and other are higher and our model is not performing well this mean model is not very accurate to predict the purchase or the target columns

Decision Tree Regression¶

In [41]:
dtr = DecisionTreeRegressor(max_depth=9)
dtr.fit(X_train, y_train)
Out[41]:
DecisionTreeRegressor(max_depth=9)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(max_depth=9)
In [42]:
#predicting trrain
train_predict = dtr.predict(X_train)
#predicting test
test_predict=dtr.predict(X_test)
In [43]:
rmse_train = np.sqrt(metrics.mean_squared_error(y_train, train_predict))
rmse_test = np.sqrt(metrics.mean_squared_error(y_test, test_predict))
print('RMSE score for Training Data:',  str(rmse_train))
print('RMSE score for Test Data:', str(rmse_test))
print("*"*20)
print('r2 score for train:', dtr.score(X_train, y_train))
print('r2 score for test:', dtr.score(X_test, y_test))
RMSE score for Training Data: 0.3680408214406253
RMSE score for Test Data: 0.3689567100682491
********************
r2 score for train: 0.7519510621944241
r2 score for test: 0.7551136360952996

Random Forest Regression¶

In [44]:
Rf = RandomForestRegressor()
In [45]:
Rf.fit(X_train, y_train)
Out[45]:
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
In [46]:
# predicting train data
rf_train_predict = Rf.predict(X_train)
# predicting test data
rf_test_predict = Rf.predict(X_test)
In [47]:
print(rf_train_predict)
[8.94589438 9.17581003 8.13538297 ... 8.84825964 9.3764006  9.16917216]
In [48]:
print(rf_test_predict)
[9.2016627  6.95881267 9.70558683 ... 9.70330271 9.79481092 9.5949654 ]
In [49]:
rmse_training = (np.sqrt(metrics.mean_squared_error(y_train, rf_train_predict)))
rmse_test = (np.sqrt(metrics.mean_squared_error(y_test, rf_test_predict)))
print('RMSE for training data:', rmse_training)
print('RMSE for test data:', rmse_test)
print('*'*50)
print('Rsquared value on training data:', Rf.score(X_train, y_train))
print('Rsquared value on test data:', Rf.score(X_test, y_test))
RMSE for training data: 0.13153861584112314
RMSE for test data: 0.34961990142644944
**************************************************
Rsquared value on training data: 0.9683151304061405
Rsquared value on test data: 0.7801097015706281
In [50]:
# Random Foresr regressor model is better than linear regression and decision tree regression, 
# as we have a low root mean square value and the higher rsquared value
In [51]:
df_test = pd.read_csv('test.csv')
df_test.head()
Out[51]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
0 1000004 P00128942 M 46-50 7 B 2 1 1 11.0 NaN
1 1000009 P00113442 M 26-35 17 C 0 0 3 5.0 NaN
2 1000010 P00288442 F 36-45 1 B 4+ 1 5 14.0 NaN
3 1000010 P00145342 F 36-45 1 B 4+ 1 4 9.0 NaN
4 1000011 P00053842 F 26-35 1 C 1 0 4 5.0 12.0
In [52]:
df_test.isna().sum()
Out[52]:
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2             72344
Product_Category_3            162562
dtype: int64
In [53]:
df_test['Product_ID'] = df_test['Product_ID'].str.replace('P00', '')
ss = StandardScaler()
df_test['Product_ID'] = ss.fit_transform(df_test['Product_ID'].values.reshape(-1,1))
In [54]:
df_test.drop(['Product_Category_3'], axis = 1, inplace = True)
In [55]:
df_test['Product_Category_2'] = df_test['Product_Category_2'].fillna(df_test['Product_Category_2'].mean())
In [56]:
df_test['Stay_In_Current_City_Years'] = df_test['Stay_In_Current_City_Years'].replace('4+', '4')
In [57]:
df_test.isna().sum()
Out[57]:
User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
dtype: int64
In [58]:
df_test.head()
Out[58]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2
0 1000004 -0.434752 M 46-50 7 B 2 1 1 11.0
1 1000009 -0.587188 M 26-35 17 C 0 0 3 5.0
2 1000010 1.133865 F 36-45 1 B 4 1 5 14.0
3 1000010 -0.273465 F 36-45 1 B 4 1 4 9.0
4 1000011 -1.173330 F 26-35 1 C 1 0 4 5.0
In [59]:
cat_cols = ['Gender', 'Age', 'City_Category']
le = LabelEncoder()
for i in cat_cols:
    df_test[i] = le.fit_transform(df_test[i]) 
In [60]:
df_test.dtypes
Out[60]:
User_ID                         int64
Product_ID                    float64
Gender                          int32
Age                             int32
Occupation                      int64
City_Category                   int32
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
dtype: object
In [61]:
df_test['Gender'] = df_test['Gender'].astype(int)
df_test['Age'] = df_test['Age'].astype(int)
df_test['Stay_In_Current_City_Years'] = df_test['Stay_In_Current_City_Years'].astype(int)
df_test['City_Category'] = df_test['City_Category'].astype('category')
In [62]:
df_test = pd.get_dummies(df_test)
In [63]:
df_test.head()
Out[63]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 City_Category_0 City_Category_1 City_Category_2
0 1000004 -0.434752 1 4 7 2 1 1 11.0 False True False
1 1000009 -0.587188 1 2 17 0 0 3 5.0 False False True
2 1000010 1.133865 0 3 1 4 1 5 14.0 False True False
3 1000010 -0.273465 0 3 1 4 1 4 9.0 False True False
4 1000011 -1.173330 0 2 1 1 0 4 5.0 False False True
In [64]:
df.shape
Out[64]:
(550068, 13)
In [65]:
df_test.shape #we have to predict the column purchase so there is difference in columns
Out[65]:
(233599, 12)
In [66]:
df
Out[66]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Purchase City_Category_0 City_Category_1 City_Category_2
0 1000001 -1.028774 0 0 10 2 0 3 9.842329 9.032409 True False False
1 1000001 0.722139 0 0 10 2 0 1 6.000000 9.629051 True False False
2 1000001 -0.845799 0 0 10 2 0 12 9.842329 7.259820 True False False
3 1000001 -0.869157 0 0 10 2 0 12 14.000000 6.963190 True False False
4 1000002 1.077382 1 6 16 4 0 8 9.842329 8.983314 False False True
... ... ... ... ... ... ... ... ... ... ... ... ... ...
550063 1006033 1.924156 1 5 13 1 1 20 9.842329 5.908083 False True False
550064 1006035 1.953267 0 2 1 3 0 20 9.842329 5.916202 False False True
550065 1006036 1.953267 0 2 15 4 1 20 9.842329 4.919981 False True False
550066 1006038 1.953267 0 6 1 2 0 20 9.842329 5.899897 False False True
550067 1006039 1.916360 0 4 0 4 1 20 9.842329 6.194405 False True False

550068 rows × 13 columns

In [67]:
df_test
Out[67]:
User_ID Product_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 City_Category_0 City_Category_1 City_Category_2
0 1000004 -0.434752 1 4 7 2 1 1 11.000000 False True False
1 1000009 -0.587188 1 2 17 0 0 3 5.000000 False False True
2 1000010 1.133865 0 3 1 4 1 5 14.000000 False True False
3 1000010 -0.273465 0 3 1 4 1 4 9.000000 False True False
4 1000011 -1.173330 0 2 1 1 0 4 5.000000 False False True
... ... ... ... ... ... ... ... ... ... ... ... ...
233594 1006036 -0.533098 0 2 15 4 1 8 9.849586 False True False
233595 1006036 0.801456 0 2 15 4 1 5 8.000000 False True False
233596 1006036 -1.389691 0 2 15 4 1 1 5.000000 False True False
233597 1006037 -0.476058 0 4 1 4 0 10 16.000000 False False True
233598 1006039 1.411200 0 4 0 4 1 4 5.000000 False True False

233599 rows × 12 columns

In [68]:
test_pred = Rf.predict(df_test)
len(test_pred)
D:\Projects\analysis\Lib\site-packages\sklearn\base.py:457: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  warnings.warn(
Out[68]:
233599

Selecting Random Forest Regressor to predict on out test dataset¶

In [69]:
frame = pd.read_csv('test.csv')
frame_info = frame[['User_ID', 'Product_ID', 'Gender', 'Occupation']]
frame_info.head()
Out[69]:
User_ID Product_ID Gender Occupation
0 1000004 P00128942 M 7
1 1000009 P00113442 M 17
2 1000010 P00288442 F 1
3 1000010 P00145342 F 1
4 1000011 P00053842 F 1
In [70]:
# creating dataframe of prediction 
prediction = pd.DataFrame(test_pred, columns=['Purchase'])
prediction['User_ID'] = frame_info['User_ID']
prediction['Product_ID'] = frame_info['Product_ID']
prediction['Gender'] = frame_info['Gender']
prediction['Occupation'] = frame_info['Occupation']


prediction.head()
Out[70]:
Purchase User_ID Product_ID Gender Occupation
0 9.553715 1000004 P00128942 M 7
1 9.609040 1000009 P00113442 M 17
2 4.515243 1000010 P00288442 F 1
3 4.515243 1000010 P00145342 F 1
4 4.686747 1000011 P00053842 F 1
In [76]:
import plotly.express as px
px.histogram(prediction, x = 'Gender', y = 'Purchase')
In [73]:
# converting the prediction into csv
prediction.to_csv('BlackFridayPrediction.csv', index = False)
In [ ]: